options(repos = c(CRAN = "https://cran.rstudio.com/"))
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
root  <- 'https://gdex.ucar.edu/dataset/camels/file'


download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'camels_attributes_v2.0.pdf')


types <- c("clim", "geol", "soil", "topo", "vege", "hydro")


remote_files  <- glue('{root}/camels_{types}.txt')

local_files   <- glue('{root}/camels_{types}.txt')


walk2(remote_files, local_files, download.file, quiet = TRUE)
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_clim.txt: cannot open destfile
'https://gdex.ucar.edu/dataset/camels/file/camels_clim.txt', reason 'No such
file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_geol.txt: cannot open destfile
'https://gdex.ucar.edu/dataset/camels/file/camels_geol.txt', reason 'No such
file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_soil.txt: cannot open destfile
'https://gdex.ucar.edu/dataset/camels/file/camels_soil.txt', reason 'No such
file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_topo.txt: cannot open destfile
'https://gdex.ucar.edu/dataset/camels/file/camels_topo.txt', reason 'No such
file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_vege.txt: cannot open destfile
'https://gdex.ucar.edu/dataset/camels/file/camels_vege.txt', reason 'No such
file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
Warning in .f(.x[[i]], .y[[i]], ...): URL
https://gdex.ucar.edu/dataset/camels/file/camels_hydro.txt: cannot open
destfile 'https://gdex.ucar.edu/dataset/camels/file/camels_hydro.txt', reason
'No such file or directory'
Warning in .f(.x[[i]], .y[[i]], ...): download had nonzero exit status
camels <- map(local_files, read_delim, show_col_types = FALSE) 


camels <- power_full_join(camels ,by = 'gauge_id')

##Question 1

#zero_q_freq represents the frequency of days when flow equals 0 mm/day. It is used to determine the number of days when there is no measurable flow in a river/stream of interest.
ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

scale_color_manual(values = c("dodgerblue", "yellow", "pink")) #lets you pick your own colors.
<ggproto object: Class ScaleDiscrete, Scale, gg>
    aesthetics: colour
    axis_order: function
    break_info: function
    break_positions: function
    breaks: waiver
    call: call
    clone: function
    dimension: function
    drop: TRUE
    expand: waiver
    get_breaks: function
    get_breaks_minor: function
    get_labels: function
    get_limits: function
    get_transformation: function
    guide: legend
    is_discrete: function
    is_empty: function
    labels: waiver
    limits: NULL
    make_sec_title: function
    make_title: function
    map: function
    map_df: function
    n.breaks.cache: NULL
    na.translate: TRUE
    na.value: grey50
    name: waiver
    palette: function
    palette.cache: NULL
    position: left
    range: environment
    rescale: function
    reset: function
    train: function
    train_df: function
    transform: function
    transform_df: function
    super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

##Question 2

library(ggplot2)
library(ggthemes)
library(ggpubr)

map_aridity <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "orange", high = "darkblue") + 
  ggthemes::theme_map() +
  labs(title = "Map of Aridity", color = "Aridity") +
  theme(legend.position = "right")


map_p_mean <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "yellow", high = "darkblue") +
  ggthemes::theme_map() +
  labs(title = "Map of Precipitation (p_mean)", color = "Precipitation") +
  theme(legend.position = "right")

combined_map <- ggarrange(map_aridity, map_p_mean, ncol = 1, nrow = 2)

print(map_aridity)

print(map_p_mean)

print(combined_map)

camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  scale_color_viridis_c() +
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)
camels <- camels |> 
  mutate(logQmean = log(q_mean))

camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  step_log(all_predictors()) %>%
  step_interact(terms = ~ aridity:p_mean) |> 
  step_naomit(all_predictors(), all_outcomes())
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

lm_model <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

lm_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(lm_model) %>%
  fit(data = camels_train) 

summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

install.packages("ranger")

The downloaded binary packages are in
    /var/folders/9z/jhvcp6pd2jdgn7z7pd30zw1w0000gn/T//Rtmp8QmRpl/downloaded_packages
library(baguette)
library(tidymodels)

rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_model)  

rf_fit <- fit(rf_wf, data = camels_train)

print(rf_fit)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_log()
• step_interact()
• step_naomit()

── Model ───────────────────────────────────────────────────────────────────────
Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, importance = ~"impurity",      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      535 
Number of independent variables:  3 
Mtry:                             1 
Target node size:                 5 
Variable importance mode:         impurity 
Splitrule:                        variance 
OOB prediction error (MSE):       0.3201147 
R squared (OOB):                  0.7715377 
rf_data <- augment(rf_fit, new_data = camels_test)
dim(rf_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.592
2 rsq     standard       0.736
3 mae     standard       0.367
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0249    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0261    10 recipe       rand…     2

##Question 3

install.packages("xgboost")

The downloaded binary packages are in
    /var/folders/9z/jhvcp6pd2jdgn7z7pd30zw1w0000gn/T//Rtmp8QmRpl/downloaded_packages
library(tidymodels)
library(baguette)
library(tidyr)

xgboost_model <- boost_tree() %>%
  set_engine("xgboost") %>%
  set_mode("regression")

nn_model <- bag_mlp() %>%
  set_engine("nnet") %>%
  set_mode("regression")

wf_set <- workflow_set(
  list(rec), 
  list(lm_model, rf_model, xgboost_model, nn_model)
)

wf_set_results <- wf_set %>%
  workflow_map("fit_resamples", resamples = camels_cv)

autoplot(wf_set_results)

rank_results(wf_set_results, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bag_mlp    Prepro… rmse    0.541  0.0257    10 recipe       bag_…     1
2 recipe_bag_mlp    Prepro… rsq     0.792  0.0240    10 recipe       bag_…     1
3 recipe_rand_fore… Prepro… rmse    0.561  0.0256    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.773  0.0262    10 recipe       rand…     2
5 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_boost_tree Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_boost_tree Prepro… rsq     0.745  0.0268    10 recipe       boos…     4
#I would move forward with the Bagged MLP model because it is the best model according to both RMSE and R-squared values. It provides the most accurate predictions out of the four models but Linear Regression or Random Forest could be used as well due to falling just short of Bagged MLP.

##Build Your Own

install.packages("rsample")

The downloaded binary packages are in
    /var/folders/9z/jhvcp6pd2jdgn7z7pd30zw1w0000gn/T//Rtmp8QmRpl/downloaded_packages
library(rsample)

set.seed(123)

camels_split <- initial_split(camels, prop = 0.75)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)
formula <- logQmean ~ aridity + p_mean + vege + topo + geol
library(tidyverse)
library(recipes)

rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  step_log(all_predictors()) %>%
  step_interact(terms = ~ aridity:p_mean) %>%
  step_naomit(all_predictors(), all_outcomes())
library(tidymodels)

rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

lm_model <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

xgboost_model <- boost_tree() %>%
  set_engine("xgboost") %>%
  set_mode("regression")
wf_set <- workflow_set(
  list(rec), 
  list(lm_model, rf_model, xgboost_model)
)

wf_set_results <- wf_set %>%
  workflow_map("fit_resamples", resamples = camels_cv)

autoplot(wf_set_results)

rank_results(wf_set_results, rank_metric = "rsq", select_best = TRUE)
# A tibble: 6 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.547  0.0250    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.788  0.0178    10 recipe       rand…     1
3 recipe_linear_reg Prepro… rmse    0.574  0.0260    10 recipe       line…     2
4 recipe_linear_reg Prepro… rsq     0.769  0.0225    10 recipe       line…     2
5 recipe_boost_tree Prepro… rmse    0.580  0.0259    10 recipe       boos…     3
6 recipe_boost_tree Prepro… rsq     0.768  0.0200    10 recipe       boos…     3
best_model <- rf_model
best_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(best_model) %>%
  fit(data = camels_train)

test_data <- bake(prep(rec), new_data = camels_test)
test_data$rf_pred <- predict(best_wf, new_data = test_data)$.pred
Warning in bake.step_log(step, new_data = new_data): NaNs produced
metrics(test_data, truth = logQmean, estimate = rf_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.793
2 rsq     standard       0.525
3 mae     standard       0.597
ggplot(test_data, aes(x = logQmean, y = rf_pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw() +
  labs(title = "Random Forest: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

library(tidymodels)

final_model <- workflow() %>%
  add_recipe(rec) %>%
  add_model(best_model) %>%
  fit(data = camels_train)

final_predictions <- augment(final_model, new_data = camels_test)

ggplot(final_predictions, aes(x = logQmean, y = .pred)) +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_minimal() +
  labs(title = "Observed vs Predicted Log Mean Flow",
       x = "Observed", 
       y = "Predicted")